package edu.nlplab.mvc;

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;

import org.springframework.stereotype.Controller;
import org.springframework.ui.ModelMap;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.servlet.ModelAndView;

import javax.annotation.Resource;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpSession;
import java.util.concurrent.Callable;

@Controller
@RequestMapping(value = "/preprocess")
public class PreProcessController {

    // NOTE(review): Spring controllers are singletons, but the fields below
    // (allWordsMap, wordsCountMap, preProcessPath, N, keyIterator) are mutable
    // per-request state shared across all sessions. Concurrent requests will
    // corrupt each other's results — this state should live in the HttpSession
    // or a per-request object. Left as-is here to preserve the interface.

    @Resource
    private HttpServletRequest request;

    /** Serves the upload page shown before preprocessing starts. */
    @RequestMapping(value = "/beforepreprocess", method = RequestMethod.GET)
    public String index(ModelMap model) {
        return "beforePreProcess";
    }

    /**
     * Asynchronously merges this session's uploaded files, segments them into
     * words, then computes word-frequency and document-frequency statistics.
     *
     * @return a Callable producing a map with "wordcount" and "costtime"
     *         entries, or {@code null} if an I/O error occurred (in which case
     *         an error message is added to the model)
     */
    @RequestMapping(value = "/segment", method = RequestMethod.GET, produces = "application/json")
    @ResponseBody
    public Callable<Map<String, Object>> segment(final ModelMap model, final HttpSession session) {
        return new Callable<Map<String, Object>>() {
            @Override
            public Map<String, Object> call() throws Exception {
                Map<String, Object> map = null;
                try {
                    // Deliberate delay (presumably so the client can display
                    // progress UI) — TODO confirm it is still wanted.
                    Thread.sleep(3 * 1000L);
                    // Merge all files uploaded in this session into one file.
                    File mergedFile = mergeFiles(session.getId());
                    map = SegmentWords(mergedFile);
                    countFrequency();
                    countDF();
                } catch (IOException e) {
                    model.addAttribute("msg", "Segment Failed");
                }
                return map;
            }
        };
    }

    /** Demo endpoint: returns a view asynchronously after a 5-second pause. */
    @RequestMapping(value = "/callable2", method = RequestMethod.GET)
    public Callable<ModelAndView> callable2() {
        return new Callable<ModelAndView>() {
            @Override
            public ModelAndView call() throws Exception {
                Thread.sleep(5 * 1000L); // pause 5 seconds
                ModelAndView mv = new ModelAndView("msg");
                mv.addObject("msg", "hello callable");
                return mv;
            }
        };
    }

    /**
     * Concatenates every uploaded file in the session's upload directory into
     * a single "merge.txt". Each source file contributes one line prefixed
     * with the marker {@code _*_<filename>_*_}; tabs, spaces, NULs and HTML
     * non-breaking-space entities are stripped from the content. Input files
     * are read as GBK; the merged output is written as UTF-8.
     *
     * @param sessionID current HTTP session id (names the upload directory)
     * @return the merged file (inside the "PreProcess" subdirectory)
     * @throws IOException if reading an upload or writing the merge file fails
     */
    private File mergeFiles(String sessionID) throws IOException {
        // Upload location: <webapp>/static/uploads/<yyyy-MM-dd>/<sessionId>/
        String prePath = request.getSession().getServletContext().getRealPath("/") + "static/uploads/";
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
        String path = prePath + df.format(new Date()) + "/" + sessionID + "/";
        File preProcessDir = new File(path + "PreProcess/");
        if (!preProcessDir.exists()) {
            preProcessDir.mkdirs();
        }
        File mergeFile = new File(preProcessDir, "merge.txt");
        // try-with-resources: the previous version leaked the writer whenever
        // listFiles() returned null or an exception interrupted the merge.
        try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(mergeFile), "UTF-8")) {
            File[] files = new File(path).listFiles();
            if (files != null) {
                for (File file : files) {
                    if (file.isDirectory()) {
                        continue; // skip the PreProcess output directory itself
                    }
                    try (BufferedReader br = new BufferedReader(
                            new InputStreamReader(new FileInputStream(file), "GBK"))) {
                        osw.write("_*_" + file.getName() + "_*_");
                        String line;
                        while ((line = br.readLine()) != null) {
                            if (line.isEmpty()) {
                                continue;
                            }
                            line = line.replace((char) 12288, ' '); // full-width (ideographic) space -> ASCII space
                            line = line.replaceAll("\t", "");
                            line = line.replaceAll(" ", "");
                            line = line.trim();
                            line = line.replaceAll("\0", "");
                            line = line.replaceAll("&nbsp;", "");
                            line = line.replaceAll("&nbsp", "");
                            osw.write(line);
                        }
                        osw.write("\r\n"); // one merged line per source file
                    }
                }
            }
        }
        return mergeFile;
    }

    // Shared mutable state (see the class-level thread-safety note).
    private Map<String, List<String>> allWordsMap = new HashMap<String, List<String>>(); // file marker -> its word list
    String preProcessPath = ""; // directory holding the preprocessing result files
    private int N = 0; // number of documents processed

    /**
     * Segments the merged file with HanLP's NotionalTokenizer, filters terms
     * by part of speech, writes the result to "segment.txt", and records each
     * document's word list in {@link #allWordsMap} for the frequency passes.
     *
     * @param mergedFile output of {@link #mergeFiles(String)}
     * @return map with "wordcount" (total terms kept) and "costtime" (elapsed
     *         time in milliseconds)
     * @throws IOException if reading or writing fails
     */
    private Map<String, Object> SegmentWords(File mergedFile) throws IOException {
        preProcessPath = mergedFile.getParent();
        File segmentedFile = new File(preProcessPath, "segment.txt");
        Map<String, Object> map = new HashMap<String, Object>();
        long start = System.currentTimeMillis();
        long wordCount = 0;
        // try-with-resources: the previous version leaked both streams on IOException.
        try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(segmentedFile), "UTF-8");
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(mergedFile), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Every merged line starts with the "_*_<filename>_*_" marker
                // written by mergeFiles; split marker and content apart.
                int pos = line.indexOf("_*_", 3);
                String preStr = line.substring(0, pos + 3);
                line = line.substring(pos + 3).trim();
                // NotionalTokenizer.segment is a static HanLP API — call it
                // statically instead of through a needless instance.
                List<Term> termList = filterListByPOS(NotionalTokenizer.segment(line));
                osw.write(preStr + "\r\n");
                osw.write(mergeList(termList));
                allWordsMap.put(preStr, getWordList(termList));
                wordCount += termList.size();
                N++;
            }
            osw.write("_*_end_*_\r\n"); // end-of-corpus sentinel
        }
        map.put("wordcount", wordCount);
        map.put("costtime", (System.currentTimeMillis() - start));
        return map;
    }

    /**
     * Keeps only terms whose part-of-speech tag begins with one of
     * {@code n, a, v, r, g} (presumably noun/adjective/verb/pronoun/academic
     * term in the HanLP tag set — confirm against the HanLP Nature docs).
     */
    private List<Term> filterListByPOS(List<Term> list) {
        final char[] keepPrefixes = {'n', 'a', 'v', 'r', 'g'};
        List<Term> newList = new ArrayList<Term>();
        for (Term term : list) {
            String nature = term.nature.toString();
            if (nature.isEmpty()) {
                continue; // guard: indexOf(c)==0 on "" was never true anyway
            }
            for (char chr : keepPrefixes) {
                if (nature.charAt(0) == chr) {
                    newList.add(term);
                    break;
                }
            }
        }
        return newList;
    }


    /** Joins terms ("word/nature" form), one per CRLF-terminated line. */
    private String mergeList(List<Term> list) {
        StringBuilder sb = new StringBuilder();
        for (Term term : list) {
            sb.append(term.toString().trim()).append("\r\n");
        }
        return sb.toString();
    }

    //-------------------------------------------------nhy------------------------------------------------------------
    /** Extracts the bare words from a term list, dropping the "/nature" suffix. */
    private List<String> getWordList(List<Term> list) {
        List<String> wordList = new ArrayList<String>();
        for (Term term : list) {
            String s = term.toString();
            wordList.add(s.substring(0, s.indexOf("/")));
        }
        return wordList;
    }

    private Map<String, Integer> wordsCountMap = new HashMap<String, Integer>(); // word -> occurrence count

    // Retained for interface compatibility; no longer needed now that the
    // counting loops use enhanced-for iteration.
    Iterator<String> keyIterator;

    /**
     * Counts how often each word occurs across all documents, prunes words
     * occurring two times or fewer from {@link #wordsCountMap}, and writes the
     * survivors to "wordsFrequency.txt" as "word:count" lines.
     *
     * @throws IOException if the output file cannot be written
     */
    private void countFrequency() throws IOException {
        for (List<String> perFileWords : allWordsMap.values()) {
            for (String word : perFileWords) {
                Integer count = wordsCountMap.get(word);
                wordsCountMap.put(word, count == null ? 1 : count + 1);
            }
        }

        File wordsFrequencyFile = new File(preProcessPath, "wordsFrequency.txt");
        // try-with-resources: the previous version leaked the writer on IOException.
        try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(wordsFrequencyFile), "UTF-8")) {
            Iterator<Map.Entry<String, Integer>> it = wordsCountMap.entrySet().iterator();
            while (it.hasNext()) {
                Map.Entry<String, Integer> entry = it.next();
                if (entry.getValue() <= 2) {
                    it.remove(); // drop rare words from all later stages
                } else {
                    osw.write(entry.getKey() + ":" + entry.getValue() + "\n");
                }
            }
        }
    }

    /**
     * Computes the document frequency (number of documents containing each
     * word) for every word surviving {@link #countFrequency()}, writing
     * "word:df" lines (df formatted as a float) to "DF.txt".
     *
     * @throws IOException if the output file cannot be written
     */
    private void countDF() throws IOException {
        File dfFile = new File(preProcessPath, "DF.txt");

        // Build one HashSet per document up front: the previous version called
        // List.contains inside the inner loop, an accidental
        // O(words * docs * docLength) pass.
        List<Set<String>> docWordSets = new ArrayList<Set<String>>();
        for (List<String> perFileWords : allWordsMap.values()) {
            docWordSets.add(new HashSet<String>(perFileWords));
        }

        // try-with-resources: the previous version leaked the writer on IOException.
        try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(dfFile), "UTF-8")) {
            for (String currentWord : wordsCountMap.keySet()) {
                float df = 0; // float preserves the original "word:3.0" output format
                for (Set<String> docWords : docWordSets) {
                    if (docWords.contains(currentWord)) {
                        df++;
                    }
                }
                osw.write(currentWord + ":" + df + "\n");
            }
        }
    }
}
